IBM Advanced Datascience Capstone Project - Soil Moisture Prediction
Objective - Predict soil moisture from past observations, which include ground and air temperature, relative humidity, and wind speed.
import numpy as np
import pandas as pd
import io
import requests
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
Data Source - Data was obtained from the Library of United States Department of Agriculture
• Dataset includes hourly hydro-meteorological variables including soil moisture, air temperature and relative humidity from 11 sites in Reynolds Creek in southwestern Idaho
Two Datasets were retrieved.
# Retrieve the hourly weather observations CSV from the USDA library
# and load it into a DataFrame.
weatherurl = "https://data.nal.usda.gov/system/files/weather_data_jdt2b.csv"
weather_csv = requests.get(weatherurl).content.decode('utf-8')
weatherData = pd.read_csv(io.StringIO(weather_csv))
weatherData.head()
# Retrieve the ground-temperature / soil-moisture CSV from the USDA
# library and load it into a DataFrame.
soilmoistureurl = "https://data.nal.usda.gov/system/files/rc.tg_.dc_.jd-jdt2b_stm_0.csv"
soil_csv = requests.get(soilmoistureurl).content.decode('utf-8')
soilmoistureData = pd.read_csv(io.StringIO(soil_csv))
soilmoistureData.head()
# The data-source documentation marks missing values with the sentinel
# -9999; keep only rows where every weather column holds a real value.
weather_columns = ['T_a', 'RH', 'e_a', 'T_d', 'w_s', 'w_d']
valid_weather = (weatherData[weather_columns] != -9999.000000).all(axis=1)
weatherClean = weatherData[valid_weather]
# The data-source documentation marks missing values with the sentinel
# -9999; keep only rows where every ground-temperature and soil-moisture
# column holds a real value.
soil_columns = ['T_g_5', 'T_g_20', 'T_g_35', 'T_g_50', 'T_g_75',
                's_m_5', 's_m_20', 's_m_35', 's_m_50', 's_m_75']
valid_soil = (soilmoistureData[soil_columns] != -9999.000000).all(axis=1)
soilMoistureClean = soilmoistureData[valid_soil]
# Join the two cleaned tables on their shared timestamp column; an inner
# join keeps only hours present in both datasets.
soilWeatherData = soilMoistureClean.merge(weatherClean, how="inner", on="Date_time")
soilWeatherData.head()
# Inspect column types before trimming redundant fields
soilWeatherData.dtypes
# Date, time and hour columns are not needed for modelling — keep only
# the measurement attributes.
keep_columns = ['T_g_5', 'T_g_20', 'T_g_35', 'T_g_50', 'T_g_75',
                's_m_5', 's_m_20', 's_m_35', 's_m_50', 's_m_75',
                'T_a', 'RH', 'e_a', 'T_d', 'w_s', 'w_d']
soilWeatherConcise = soilWeatherData.filter(keep_columns, axis=1)
soilWeatherConcise.head()
soilWeatherConcise.shape
There are a little over 27k observations with 16 attributes.
# Summary statistics (count, mean, std, min/max, quartiles) per attribute
soilWeatherConcise.describe()
#Data Visualization
Pairs plots help explore distributions and relationships between dependent and independent variables. The Seaborn package is great for visualization of data. A pairs plot gives a comprehensive first look at our data set and a great starting point for the data analysis of the soil moisture prediction project.
# Seaborn visualization library
import seaborn as sns
# Create the default pairplot: a grid of pairwise scatter plots with
# per-variable histograms on the diagonal, one row/column per attribute
sns.pairplot(soilWeatherConcise)
# Pairwise Pearson correlations between all attributes
corr = soilWeatherConcise.corr()
# Plot figsize
fig, ax = plt.subplots(figsize=(10, 10))
# Diverging palette: one hue for negative, another for positive correlation
colormap = sns.diverging_palette(220, 10, as_cmap=True)
# Generate heat map, with each coefficient annotated to two decimals
sns.heatmap(corr, cmap=colormap, annot=True, fmt=".2f")
# seaborn draws each heatmap cell centred at i + 0.5, so the tick labels
# must sit there too; the original integer positions shifted every label
# half a cell away from its row/column.
plt.xticks(np.arange(len(corr.columns)) + 0.5, corr.columns)
plt.yticks(np.arange(len(corr.columns)) + 0.5, corr.columns)
# show plot
plt.show()
Temperature and soil moisture have a good correlation, especially at 35 and 20 cm depth.
# Target: volumetric soil moisture at 35 cm depth; everything else is a
# feature. (pop removes the column from the frame in place.)
y = soilWeatherConcise.pop('s_m_35')
X = soilWeatherConcise

# Hold out 20% of the rows for testing; fixed seed keeps the split
# reproducible across runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Scale every feature into [0, 1]. The scaler is fitted on the training
# split only and then re-used on the test split, so no information leaks
# from the held-out data.
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()
X_train = min_max_scaler.fit_transform(X_train)
X_test = min_max_scaler.transform(X_test)
def applyModel(regressor, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Fit *regressor*, print its test-set score and return its predictions.

    The data arguments default to the module-level train/test split, so
    the original single-argument call style keeps working; pass them
    explicitly to evaluate the model on other data.

    Parameters: regressor — any estimator with fit/predict/score.
    Returns: the predictions for the test features.
    """
    if X_tr is None:
        X_tr = X_train
    if y_tr is None:
        y_tr = y_train
    if X_te is None:
        X_te = X_test
    if y_te is None:
        y_te = y_test
    regressor.fit(X_tr, y_tr)
    y_pred = regressor.predict(X_te)
    print(regressor.score(X_te, y_te))
    return y_pred
def printMetrics(y_pred, y_true=None):
    """Print R2, RMSE and MAE for *y_pred* against the held-out targets.

    y_true defaults to the module-level y_test, preserving the original
    call style; pass it explicitly to score against other targets.
    """
    from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
    from math import sqrt
    if y_true is None:
        y_true = y_test
    print("R2 score is", r2_score(y_true, y_pred))
    # Use distinct local names: the original assigned its results to
    # `mean_squared_error` / `mean_absolute_error`, shadowing the sklearn
    # functions just imported.
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    print("Root Mean Squared Error = ", sqrt(mse))
    print("Mean Absolute Error = ", mae)
def plotResults(y_pred_pd):
    """Scatter measured vs. predicted soil moisture with a y = x reference line."""
    figure, axes = plt.subplots()
    axes.scatter(y_test, y_pred_pd)
    # Dashed diagonal marks perfect prediction
    lo, hi = y_test.min(), y_test.max()
    axes.plot([lo, hi], [lo, hi], 'k--', lw=4)
    axes.set_xlabel('Measured Soil Moisture(%)')
    axes.set_ylabel('Predicted Soil Moisture(%)')
    axes.set_title("Soil Moisture at 35cm from Ground level")
    plt.show()
from sklearn.svm import SVR

# --- SVR with a linear kernel (default epsilon = 0.1) ---
print("SVM Regressor with Kernel set to linear")
# 'degree' only applies to the polynomial kernel, so it is omitted here;
# the original's degree=3 had no effect.
regressor = SVR(kernel='linear')
y_pred_Lin1 = applyModel(regressor)
y_pred_pd = pd.DataFrame(y_pred_Lin1, columns=['s_m_35'])
printMetrics(y_pred_Lin1)
plotResults(y_pred_pd)

# --- SVR with an RBF kernel, epsilon = 0.1 ---
print("SVM Regressor with Kernel set to RBF and epsilon set to 0.1")
regressor = SVR(kernel='rbf', epsilon=0.1)
y_pred_RBF1 = applyModel(regressor)
y_pred_pd = pd.DataFrame(y_pred_RBF1, columns=['s_m_35'])
printMetrics(y_pred_RBF1)
plotResults(y_pred_pd)

# --- SVR with a polynomial kernel, epsilon = 0.1 ---
print("SVM Regressor with Kernel set to poly and epsilon set to 0.1")
regressor = SVR(kernel='poly', epsilon=0.1)
y_pred_Poly1 = applyModel(regressor)
y_pred_pd = pd.DataFrame(y_pred_Poly1, columns=['s_m_35'])
printMetrics(y_pred_Poly1)
plotResults(y_pred_pd)
# Repeat the three kernels with a tighter epsilon tube (0.01).

# --- SVR with a linear kernel, epsilon = 0.01 ---
print("SVM Regressor with Kernel set to linear and epsilon set to 0.01")
# 'degree' only applies to the polynomial kernel, so it is omitted here;
# the original's degree=3 had no effect.
regressor = SVR(kernel='linear', epsilon=0.01)
y_pred_Lin2 = applyModel(regressor)
y_pred_pd = pd.DataFrame(y_pred_Lin2, columns=['s_m_35'])
printMetrics(y_pred_Lin2)
plotResults(y_pred_pd)

# --- SVR with an RBF kernel, epsilon = 0.01 ---
print("SVM Regressor with Kernel set to RBF and epsilon set to 0.01")
regressor = SVR(kernel='rbf', epsilon=0.01)
y_pred_RBF2 = applyModel(regressor)
y_pred_pd = pd.DataFrame(y_pred_RBF2, columns=['s_m_35'])
printMetrics(y_pred_RBF2)
plotResults(y_pred_pd)

# --- SVR with a polynomial kernel, epsilon = 0.01 ---
print("SVM Regressor with Kernel set to POLY and epsilon set to 0.01")
regressor = SVR(kernel='poly', epsilon=0.01)
y_pred_poly2 = applyModel(regressor)
y_pred_pd = pd.DataFrame(y_pred_poly2, columns=['s_m_35'])
printMetrics(y_pred_poly2)
plotResults(y_pred_pd)
# Now try a feed-forward neural network for the same regression task.
from keras.models import Sequential
from keras.layers import Dense

# Three hidden ReLU layers of 64 units feeding a single linear output
# unit (suitable for regression).
model = Sequential()
# Derive the input width from the training data instead of hard-coding
# 15, so the model stays correct if the feature set changes.
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))
# Compile model: mean-squared-error loss with the Adam optimizer
batch_size = 100
model.compile(loss='mse', optimizer='adam')
# Train model
model.fit(X_train, y_train, epochs=100, batch_size=batch_size)
# Make a prediction on the held-out split and report the usual metrics
ypredNN = model.predict(X_test)
y_pred_pd = pd.DataFrame(ypredNN, columns=['s_m_35'])
printMetrics(ypredNN)
plotResults(y_pred_pd)
Conclusion - Neural networks perform better than Support Vector Regression. I am hopeful that with further tuning of hyperparameters in SVR, it would also perform on par with the neural network for regression.